package wish.experimental;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.htmlparser.Parser;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Experimental driver that loads an HTML file into memory, feeds it to the
 * HTMLParser library and prints the text of every node the parser yields.
 */
public class ParserDOM {

	/** File parsed when no path is supplied on the command line. */
	private static final String DEFAULT_HTML_FILE =
			"/home/pjbarrio/Documents/RelationExtractionStuff/DatabasesInterfaces/Data/67.html";

	/**
	 * Parses an HTML file and prints one line per node returned by
	 * {@link Parser#elements()}, each prefixed with {@code "AA:"}.
	 *
	 * @param args optional; {@code args[0]} is the path of the HTML file to
	 *             parse. When absent, {@link #DEFAULT_HTML_FILE} is used.
	 * @throws IOException     if the file cannot be opened or read
	 * @throws ParserException if the parser fails while iterating the nodes
	 */
	public static void main(String[] args) throws IOException, ParserException {

		String htmlFile = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_FILE;

		String text = readFile(new File(htmlFile));

		// The charset passed to Page matches the one used to decode the file in readFile.
		Parser p = new Parser(new Lexer(new Page(text, "UTF-8")));

		NodeIterator elements = p.elements();

		while (elements.hasMoreNodes()) {
			System.out.println("AA:" + elements.nextNode().getText());
		}
	}

	/**
	 * Reads a whole text file into a single string.
	 *
	 * <p>Lines are joined with {@code "\n"} regardless of the line terminator
	 * used in the file, and no trailing newline is appended after the last
	 * line. The bytes are decoded as UTF-8.
	 *
	 * @param file the file to read
	 * @return the file content; the empty string if the file has no lines
	 *         (never {@code null})
	 * @throws IOException if the file cannot be opened or read
	 */
	private static String readFile(File file) throws IOException {

		StringBuilder content = new StringBuilder();

		// try-with-resources guarantees the reader is closed even when a read fails.
		// InputStreamReader is used (rather than FileReader) so the charset is explicit
		// instead of depending on the platform default.
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {

			String line = reader.readLine();

			// An empty file yields null on the first read; leave content empty in that case.
			if (line != null) {
				content.append(line);

				while ((line = reader.readLine()) != null) {
					content.append('\n').append(line);
				}
			}
		}

		return content.toString();
	}

}
